2025-12-02
3) Augmenting the data - Standardardized (pr. 100k) vs normal
Joining datasets:
The datasets contain TB cases for a given country. However, having a large population size (India and China), will also cause the overall TB cases to be larger for such countries.
Solution: Standardize the data! (I.e. amount of people with TB out of every pr. 100,000 citizen).
#Data augmentation - Calculating the TB cases pr. 100k (standardizing the data):
TB_age_sex <- TB_age_sex |>
group_by(country) |>
#Getting the total sum of TB cases for every country
mutate(
total_TB_cases_best = sum(TB_cases_best),
total_TB_cases_min = sum(TB_cases_min),
total_TB_cases_max = sum(TB_cases_max)
) |>
#calculate the pr. 100k amount of cases
mutate(
TB_cases_pr_100k_best = TB_cases_best/population_size*10^5,
TB_cases_pr_100k_min = TB_cases_min/population_size*10^5,
TB_cases_pr_100k_max = TB_cases_max/population_size*10^5,
)4) Analysis - 2024 dataset - TB + Risk factor distribution
Bar chart - TB + Risk factor cases in the top 10 most TB burdened countries:
#Code for making the plot of the TB + risk factor cases:
TB_age_sex |>
filter(country %in% top_10_countries_100k) |>
group_by(country, risk_factor) |>
filter(risk_factor != "no risk factor") |>
summarise(
TB_cases_best = sum(TB_cases_best),
TB_cases_min = sum(TB_cases_min),
TB_cases_max = sum(TB_cases_max),
population_size = first(population_size),
TB_cases_best_100k = TB_cases_best/population_size*10^5,
TB_cases_min_100k = TB_cases_min/population_size*10^5,
TB_cases_max_100k = TB_cases_max/population_size*10^5,
) #... followed by ggplotBar chart of the risk factor distribution: (Without the ‘no risk factor’ label)
Box plots of the global risk factor distribution:
TB_age_sex |>
group_by(country, risk_factor) |>
filter(risk_factor != "no risk factor") |>
summarise(
TB_cases_best = sum(TB_cases_best),
TB_cases_min = sum(TB_cases_min),
TB_cases_max = sum(TB_cases_max),
population_size = first(population_size),
TB_cases_best_100k = TB_cases_best/population_size*10^5,
TB_cases_min_100k = TB_cases_min/population_size*10^5,
TB_cases_max_100k = TB_cases_max/population_size*10^5,
) #... followed by ggplotBox plot of TB + risk factor cases, globally:
5) Analysis 2 - 2024 Dataset - TB sex distribution
TB_age_sex |>
group_by(country, sex) |>
summarise( #getting the summed TB case values for each country and sex.
TB_best_100k = sum(TB_cases_best)/first(population_size)*10^5, #Use first(),
# as the same value is repeated on several rows, for each country.
TB_min_100k = sum(TB_cases_min)/first(population_size)*10^5,
TB_max_100k = sum(TB_cases_max)/first(population_size)*10^5,
) |>
filter(country %in% top_10_countries_100k) |> #<-- The new change.
...# followed by ggplotBox Plot - TB sex distribution averaged for top 10 most burdened countries:
TB_age_sex |>
group_by(country, sex) |>
summarise( #getting the summed TB case values for each country and sex.
TB_best_100k = sum(TB_cases_best)/first(population_size)*10^5, #Use first(),
# as the same value is repeated on several rows, for each country.
TB_min_100k = sum(TB_cases_min)/first(population_size)*10^5,
TB_max_100k = sum(TB_cases_max)/first(population_size)*10^5,
) |>
filter(country %in% top_10_countries_100k) |>
ggplot(aes(x = TB_best_100k, y = country, color = sex)) +
geom_point(position = position_dodge(width = 0.5), size = 3) +
geom_errorbarh(
aes(xmin = TB_min_100k, xmax = TB_max_100k),
position = position_dodge(width = 0.5),
height = 0.4) +
labs(
x = "TB cases per 100k\n(Best estimate with min/max)",
y = "Country",
title = "TB cases pr. 100k by sex\n(Countries with top 10 most TB cases pr. 100k)") +
theme_minimal()Scatter plot of TB cases by sex in each top 10 country:
6) Analysis 2 - 2024 Dataset - TB age distribution
TB_age_sex |>
group_by(age_group) |>
summarise(
TB_best = sum(TB_cases_best), #Summing the amount of total TB cases for this group
TB_min = sum(TB_cases_min),
TB_max = sum(TB_cases_max),
) |>
mutate(
age_lower = as.numeric(str_extract(age_group, "^[0-9]+")),
age_lower = ifelse(is.na(age_lower), Inf, age_lower) #This part helps order the age group intervals correctly.
) |>
#Removing undesirable age groups.
filter(!age_group %in% c("15+", "18+", "all", "0-14")) |>
#...followed by ggplotBox Plot - Worldwide, TB per. 100k - age distribution:
TB_age_sex |>
group_by(age_group) |>
filter(country %in% top_10_countries_100k) |> #<-- New change
summarise(
TB_best = sum(TB_cases_best), #Summing the amount of total TB cases for this group
TB_min = sum(TB_cases_min),
TB_max = sum(TB_cases_max),
) |>
mutate(
age_lower = as.numeric(str_extract(age_group, "^[0-9]+")),
age_lower = ifelse(is.na(age_lower), Inf, age_lower) #This part helps order the age group intervals correctly.
) |>
filter(!age_group %in% c("15+", "18+", "all", "0-14")) |> #<--- New change! #Removing undesirable age groups.
#...followed by ggplotBox plot for top 10 most TB infected countries, total TB cases - with age groups
7) Analysis - Global TB evolution
8) Analysis - TB Preventive care data
gap_analysis <- TB_10_years_joined |>
filter(year == 2024) |>
filter(!is.na(Preventive_Tx_Pct) & !is.na(Contacts_best))
# 2. Define Dynamic Thresholds
# Burden Threshold: Top 10% of countries with the most contacts
burden_threshold <- quantile(gap_analysis$Contacts_best, 0.90)
# Coverage Threshold: Less than 40% coverage
coverage_threshold <- 40
# 3. Create a Flag for High Risk Countries
gap_analysis <- gap_analysis |>
mutate(
Is_High_Risk = if_else(Contacts_best >= burden_threshold & Preventive_Tx_Pct < coverage_threshold,
"High Risk (High Burden, Low Coverage)",
"Other")
)df_clean <- df |>
mutate(TPT_Contacts_Num = Contacts_best * Preventive_Tx_Pct / 100)
global_trends <- df_clean |>
filter(year >= 2015) |>
group_by(year) |>
summarise(
Total_TPT_Contacts = sum(TPT_Contacts_Num, na.rm = TRUE),
Total_Contacts_Eligible = sum(Contacts_best, na.rm = TRUE),
Avg_Child_Coverage = mean(Preventive_Tx_Kids_Pct, na.rm = TRUE)
) |>
mutate(Contact_Coverage = (Total_TPT_Contacts / Total_Contacts_Eligible) * 100)9) Analysis - RR-TB and HIV (2015 - 2024):
Scatter plot with linear regresion per region:
TB_10_years_joined |>
ggplot(aes(x=(rr_incidence/e_pop_num*10^5),y=e_mort_100k, color=g_whoregion.x)) +
geom_point(alpha = 0.45)+
geom_smooth(method='lm', se = FALSE)+
coord_cartesian(xlim=c(0,45), ylim=c(0,150)) + #Eye calculated good limit for the plot
scale_color_brewer(palette = "Set2",
labels = labels$label,
breaks = labels$g_whoregion.x) +
labs(
x = "RR - TB",
y = "Mortality",
title = "Relation between RR - TB and Mortality per region",
color = "WHO Region") +
theme_minimal()Box plots of mortality percentages based on different combinations of TB and HIV:
TB_10_years_joined |>
mutate(
disease = case_when(
e_tbhiv_prct > mean(e_tbhiv_prct, na.rm = TRUE) & rr_new > mean(rr_new, na.rm = TRUE) ~ "HIV & RR-TB",
e_tbhiv_prct > mean(e_tbhiv_prct, na.rm = TRUE) ~ "HIV & TB",
rr_new > mean(rr_new, na.rm = TRUE) ~ "Only RR-TB",
TRUE ~ "Only TB"
)) |>
ggplot(aes(x=disease, y =e_mort_100k, fill = disease))+
geom_boxplot() +
scale_fill_brewer(palette = "Set2") +
labs(
x = "",
y = "Mortality cases per 100k",
subtitle = "Mortality according to combinations of HIV and RR-TB",
fill = "Disease"
) +
theme_minimal()